import pandas as pd
from matplotlib import pyplot as plt
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_validate
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
# Load the pre-split supervised train/test sets and clean them:
# replace +/-inf with NaN, then drop any row containing NaN.
path = '../../data_lake/twindle/supervised'
train = pd.read_csv(f'{path}/train.csv')
test = pd.read_csv(f'{path}/test.csv')
train = train.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
test = test.replace([np.inf, -np.inf], np.nan).dropna(axis=0)
train.head()  # notebook display; no effect when run as a script
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat preserves the original indices, exactly like append did.
# `df` is only used later for the global co2 min/max axis limits.
df = pd.concat([train, test])
# Forecast horizons: CO2 one to five steps ahead.
targets = [f'co2(t+{h})' for h in range(1, 6)]

# Input features: every sensor at lags t-3, t-2, t-1, the current reading,
# and its mean-change summary — generated instead of hand-listed, in the
# exact same order (lag-3 block, lag-2 block, lag-1 block, current, mean_change).
_sensors = ['co2', 'humidity', 'temperature', 'pressure', 'activity', 'illumination']
features = (
    [f'{s}(t-{lag})' for lag in (3, 2, 1) for s in _sensors]
    + list(_sensors)
    + [f'{s}_mean_change' for s in _sensors]
)

# Shared preprocessing steps; GridSearchCV tunes how many features to keep.
selector = SelectKBest(f_regression)
scaler = StandardScaler()
param_grid = {
    'selector__k': range(3, len(features))
}

# Candidate models (one active; alternatives kept for experimentation).
pipelines = [
    (Pipeline([('selector', selector), ('scaler', scaler), ('model', Ridge())]), 'RR'),
    # (Pipeline([('selector', selector), ('scaler', scaler), ('model', DecisionTreeRegressor())]), 'DTR'),
    # (Pipeline([('selector', selector), ('scaler', scaler), ('model', RandomForestRegressor())]), 'RF'),
    # (Pipeline([('selector', selector), ('scaler', scaler), ('model', MLPRegressor())]), 'MLP')
]

# Accumulators filled by the training loop below.
results = []
scores = []
# Fit one tuned pipeline per (target, model) pair, recording the best
# estimator, the names of the features its selector kept, and the raw
# selector scores for plotting.
X = train[features]  # invariant across iterations — hoisted out of the loops
for target in targets:
    y = train[target]
    for pipeline, abbr in pipelines:
        search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            n_jobs=-1,
            scoring='neg_mean_squared_error',
            cv=5,
            verbose=1
        )
        # Use fresh names here: the original code rebound the module-level
        # `selector`/`pipeline` names inside the loop, shadowing the objects
        # the pipelines were built from.
        best_pipeline = search.fit(X, y).best_estimator_
        fitted_selector = best_pipeline.named_steps['selector']
        # Boolean support mask, aligned with `features`; zipping avoids the
        # O(n^2) `i in get_support(indices=True)` membership test per feature.
        support = fitted_selector.get_support()
        selected_features = [f for f, keep in zip(features, support) if keep]
        results.append((abbr, best_pipeline, selected_features))
        scores.append((abbr, support, fitted_selector.scores_))
# (captured cell output) "Fitting 5 folds for each of 27 candidates, totalling 135 fits" — printed once per target (5 times).
# One horizontal bar chart per recorded fit, showing the min-max-normalized
# f_regression score of each feature the selector kept.
fig, axes = plt.subplots(5, 1, figsize=(12, 10))
for ax, (abbr, support_mask, selector_scores) in zip(axes.flatten(), scores):
    temp = pd.DataFrame({'feature': features, 'selected_feature': support_mask, 'score': selector_scores}).sort_values(by='score')
    # Boolean mask directly (no `== True`); .copy() so the column assignment
    # below writes to an independent frame instead of a view
    # (avoids SettingWithCopyWarning).
    temp = temp[temp.selected_feature].copy()
    # Min-max normalize the scores of the kept features to [0, 1].
    temp['score'] = (temp['score'] - temp['score'].min()) / (temp['score'].max() - temp['score'].min())
    ax.barh(temp.feature, temp.score)
    ax.set_title(f'{abbr}')
    ax.set_xlabel('Normalized score')
fig.tight_layout()
# Collect test-set predictions from every fitted pipeline.
abbreviations = []
predictions_list = []
y_true = test[targets]
for abbr, pipeline, selected_features in results:
    # Feed the full feature set through the fitted pipeline: its selector
    # reproduces the trained column subset internally. This replaces the
    # original hand-assembled Pipeline(steps=[scaler, model]) fed with
    # test[selected_features], which silently depended on the manual column
    # selection matching the selector's output order.
    predictions = pipeline.predict(test[features])
    abbreviations.append(abbr)
    predictions_list.append(predictions)
def visualize(abbreviations, predictions_list, suptitle):
    """Plot predicted-vs-actual scatters, metric bar charts, and residual
    histograms for each prediction series, and return a metrics DataFrame
    with columns model / R2 / Root Mean Squared Error / Mean Absolute Error.

    Reads the module-level ``y_true``. The original version passed the whole
    (n_samples, n_targets) ``y_true`` DataFrame to ``ax.scatter`` against a
    1-D prediction vector, raising ``ValueError: x and y must be the same
    size``. Fix: pair prediction i with ``y_true`` column i (the same
    positional convention the per-target evaluation cell at the bottom of
    the file uses); extra predictions beyond the available columns are
    dropped by ``zip``. A 1-D ``y_true`` is reused for every prediction.
    """
    if getattr(y_true, 'ndim', 1) == 2:
        truths = [y_true.iloc[:, i] for i in range(y_true.shape[1])]
    else:
        truths = [y_true] * len(predictions_list)
    triples = list(zip(abbreviations, predictions_list, truths))

    n_cols = len(triples)
    n_rows = 3  # row 0: scatters, row 1: metric bars, row 2: residuals
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 4, n_rows * 4))
    fig.suptitle(suptitle)

    # Row 0: predicted vs. actual scatter per model.
    for (abbr, predictions, actual), ax in zip(triples, axes[0].flatten()):
        ax.scatter(actual, predictions)
        ax.set_title(f'{abbr} - Predicted vs. Actual')
        ax.set_xlabel('Actual')
        ax.set_ylabel('Predicted')

    # Metrics, each computed against the matching ground-truth series.
    r2s, rmses, maes = [], [], []
    for abbr, predictions, actual in triples:
        r2s.append(r2_score(actual, predictions))
        rmses.append(mean_squared_error(actual, predictions, squared=False))
        maes.append(mean_absolute_error(actual, predictions))
    scores_df = pd.DataFrame({'model': [a for a, _, _ in triples], 'R2': r2s, 'Root Mean Squared Error': rmses, 'Mean Absolute Error': maes})

    # Row 1: one bar chart per metric, models sorted by that metric.
    for score_name, ax in zip(scores_df.columns[1:], axes[1].flatten()):
        x = scores_df[[score_name, 'model']].sort_values(by=score_name, ascending=True)
        ax.bar(x.model, x[score_name])
        ax.set_title(score_name)
        ax.set_ylabel('Score')

    # Row 2: residual (predicted - actual) distribution per model.
    for (abbr, predictions, actual), ax in zip(triples, axes[2].flatten()):
        residuals = predictions - actual
        ax.hist(residuals)
        ax.set_title(f'{abbr} - Residuals Distribution')
        ax.set_ylabel('Count')
        ax.set_xlabel('Residual')

    fig.tight_layout(rect=[0, 0.03, 1, 0.95])
    return scores_df
# Persistence baseline (LOCF — last observation carried forward): predict
# that the next CO2 reading equals the most recent one, co2(t-1).
locf = test['co2(t-1)']
# Prepend the baseline so it appears first in every evaluation plot.
abbreviations.insert(0, 'Baseline')
predictions_list.insert(0, locf)
scores_df = visualize(abbreviations, predictions_list, 'Model Evaluations')
# (captured cell output) ValueError traceback: "x and y must be the same size", raised inside
# visualize() at ax.scatter(y_true, predictions) — y_true is the full (n_samples, 5) target
# DataFrame while each prediction vector is 1-D, so matplotlib's size check fails.
scores_df  # notebook display of the metrics table; a no-op when run as a script
# Per-horizon predicted-vs-actual scatters with R2 / RMSE in each title.
fig, axes = plt.subplots(1, 5, figsize=(20, 4))
# Skip the baseline that was inserted at index 0 above: zipping the full
# predictions_list against `targets` would pair every model prediction with
# the wrong horizon (prediction for t+1 scored against co2(t+2), etc.).
for predictions, target, ax in zip(predictions_list[1:], targets, axes.flatten()):
    y_true = test[target]
    r2 = r2_score(y_true, predictions)
    rmse = mean_squared_error(y_true, predictions, squared=False)
    ax.set_title(f'{target}. \nr2: {r2:.3f}, rmse: {rmse:.3f}')
    ax.scatter(y_true, predictions)
    # Shared axis limits from the global CO2 range so panels are comparable.
    ax.set_ylim(df.co2.min(), df.co2.max())
    ax.set_xlim(df.co2.min(), df.co2.max())
    # Identity (perfect-prediction) reference line in axes coordinates.
    ax.plot([0, 1], [0, 1], '--', transform=ax.transAxes, color='r')